### Plan

Phase 1 

1. Take part of the data - 7 million? (5 mil train and 1 mil valid and 1 mil test)?
2. Create features
3. Categorical/Numerical variables
4. Create embedding sizes from categorical variables
5. Make predictions on the original test data using this model trained on just 5 million rows

### Read Libraries

In [1]:
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
import gc
import pdb
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
import tqdm

### Read data path

In [2]:
path = '/home/paperspace/data/talkingdata/'

### Read train data and write it to train_processed

In [3]:
train = pd.read_csv(path+'train.csv.zip',compression='zip',low_memory=True,usecols=['ip','app','device','os','channel','click_time','is_attributed'])

In [4]:
train.shape

(184903890, 7)

In [5]:
train.reset_index(drop=True,inplace=True)

In [6]:
train.head(5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [7]:
train['click_time'] = pd.to_datetime(train['click_time'])

In [8]:
train['hour'] = train['click_time'].dt.hour
train['minute'] = train['click_time'].dt.minute

In [None]:
train.to_csv(path+"train_processed.csv",index=False)
del train
gc.collect()


### Read train_processed

In [3]:
train_processed = pd.read_csv(path+"train_processed.zip",compression='zip',low_memory=True,usecols=['ip','app','device','os','channel','hour','minute'])

In [4]:
y_train = pd.read_csv(path+"train_processed.zip",compression='zip',low_memory=True,usecols=['is_attributed'])

In [5]:
y_train = np.ravel(y_train['is_attributed'].values)

In [6]:
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

### Read test data

In [7]:
test = pd.read_csv(path+'test.csv.zip',compression='zip',low_memory=True,usecols=\
                    ['ip','app','device','os','channel','click_time'])

In [8]:
test.shape

(18790469, 6)

In [9]:
test.head(5)

Unnamed: 0,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00
1,119901,9,1,3,466,2017-11-10 04:00:00
2,72287,21,1,19,128,2017-11-10 04:00:00
3,78477,15,1,13,111,2017-11-10 04:00:00
4,123080,12,1,13,328,2017-11-10 04:00:00


In [10]:
test.reset_index(drop=True,inplace=True)

In [11]:
test['click_time'] = pd.to_datetime(test['click_time'])

In [12]:
test['hour'] = test['click_time'].dt.hour
test['minute'] = test['click_time'].dt.minute

In [13]:
test.drop(columns='click_time',inplace=True)

### Merge and transform train and test data

In [16]:
def proc_col(col):
        uniq = col.unique()
        name2idx = {o:i for i,o in enumerate(uniq)}
        return (uniq, name2idx, np.array([name2idx[x] for x in col]), len(uniq))

In [17]:
cat_vars = ['ip','app','device','os','channel','hour','minute']

In [18]:
for i in cat_vars:
    col_tot = pd.concat([train_processed[i],test[i]],axis=0)
    _,_,col_tot,_ = proc_col(col_tot)
    train_processed[i] = col_tot[range(len(train_processed.index))]
    test[i] = col_tot[len(train_processed.index):]
del col_tot
gc.collect()

189

In [19]:
train_processed.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute
0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,1
2,2,0,0,0,0,0,2
3,3,1,0,0,1,0,2
4,4,0,0,0,0,0,3


In [20]:
test.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute
0,51203,16,0,7,34,14,53
1,1357,16,0,7,44,14,53
2,7583,10,0,1,7,14,53
3,4438,14,0,0,90,14,53
4,23741,15,0,0,56,14,53


In [21]:
train_processed.to_csv("train_indexed.csv",compression='gzip',chunksize=1000000,index=False)

SystemError: <built-in method item of numpy.ndarray object at 0x7febb403ce40> returned a result with an error set

In [None]:
test.to_csv("test.csv",compression='gzip',chunksize=1000000,index=False)

In [22]:
for i in cat_vars: 
    col_tot = pd.concat([train_processed[i],test[i]],axis=0)
    col_tot = col_tot.astype('category').cat.as_ordered().copy()
    train_processed[i] = col_tot.loc[range(len(train_processed.index)),].copy()
    test[i] = col_tot.loc[-range(len(train_processed.index)),].copy()

MemoryError: 

In [11]:
test.index.max()

18790468

In [12]:
test.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute
0,5744,9,1,3,107,4,0
1,119901,9,1,3,466,4,0
2,72287,21,1,19,128,4,0
3,78477,15,1,13,111,4,0
4,123080,12,1,13,328,4,0


In [18]:
gc.collect()

2681

In [28]:
train['train']=1

In [31]:
test['train']=0

In [43]:
name2idx =[]
for v in cat_vars:
    _,temp,train_test_merge[v],_ = proc_col(train_test_merge[v])
    name2idx.append(temp)

NameError: name 'proc_col' is not defined

In [103]:
for v in cat_vars: 
    train_test_merge[v] = train_test_merge[v].astype('category').cat.as_ordered().copy()

In [68]:
max(ip_new)

61521

In [69]:
len(set(proc_train['ip']))

61522

In [58]:
max(proc_train['ip'])

364778

In [75]:
ip_df = pd.Series(ip_new)

In [76]:
ip_df = ip_df.astype('category').cat.as_ordered().copy()

In [77]:
len(ip_df.cat.categories)

61522

In [78]:
len(set(df['ip']))

62805

In [104]:
cat_sz = [(c, len(proc_train[c].cat.categories)+1) for c in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [105]:
cat_sz

[('ip', 252998),
 ('app', 327),
 ('device', 1884),
 ('os', 185),
 ('channel', 180),
 ('hour', 25),
 ('minute', 61)]

In [106]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [107]:
emb_szs

[(252998, 50), (327, 50), (1884, 50), (185, 50), (180, 50), (25, 13), (61, 31)]

In [108]:
train_test_merge.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,0,0,0,0,0,0,0,1
1,1,1,0,1,1,1,1,1
2,2,0,0,0,0,1,1,1
3,3,0,0,2,2,1,1,1
4,4,0,0,0,2,1,1,1


In [109]:
train_cat = train_test_merge[train_test_merge['train']==1].copy()

test_cat = train_test_merge[train_test_merge['train']==0].copy()

In [None]:
train_cat.drop(columns='train',inplace=True)

In [116]:
test_cat.drop(columns='train',inplace=True)

In [110]:
del train_test_merge
gc.collect()

2673

In [111]:
train_cat.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,0,0,0,0,0,0,0,1
1,1,1,0,1,1,1,1,1
2,2,0,0,0,0,1,1,1
3,3,0,0,2,2,1,1,1
4,4,0,0,0,2,1,1,1


In [112]:
test_cat.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,66600,1,0,26,27,13,1,0
1,29623,1,0,26,57,13,1,0
2,8141,26,0,6,21,13,1,0
3,18658,45,0,0,104,13,1,0
4,219653,46,0,0,71,13,1,0


In [117]:
proc_train = train_cat.loc[list(train_idx),:].copy()

In [124]:
del proc_train
gc.collect()

450

In [118]:
proc_train.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute
0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,1
2,2,0,0,0,0,1,1
3,3,0,0,2,2,1,1
4,4,0,0,0,2,1,1


In [120]:
test_cat.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute
0,66600,1,0,26,27,13,1
1,29623,1,0,26,57,13,1
2,8141,26,0,6,21,13,1
3,18658,45,0,0,104,13,1
4,219653,46,0,0,71,13,1


In [32]:
df_model = proc_train.drop(columns='is_attributed')

In [34]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [35]:
df_model.shape

(6300000, 13)

In [36]:
df_model_test = df.loc[test_idx,:].copy()

### Jeremy's class

In [None]:
train_cat.to_csv("train_cat.csv",index=True)
test_cat.to_csv("test_cat.csv",index=True)

In [3]:
train_cat= pd.read_csv("train_cat.csv",index_col=0)
test_cat = pd.read_csv("test_cat.csv",index_col=0)
y_train= pd.read_csv("y_train.csv",index_col=0,header=None)

  mask |= (ar1 == a)


In [4]:
y_train = np.ravel(y_train.values)
y_train[:5]

array([1, 1, 1, 1, 1])

In [5]:
train_cat['train']=1
test_cat['train']=0
train_test_cat_merged = pd.concat([train_cat,test_cat],axis=0)
train_test_cat_merged.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,0,0,0,0,0,0,0,1
1,1,1,0,1,1,1,1,1
2,2,0,0,0,0,1,1,1
3,3,0,0,2,2,1,1,1
4,4,0,0,0,2,1,1,1


In [6]:
train_test_cat_merged.tail(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
18790464,59674,1,0,0,163,0,1,0
18790465,17248,102,0,38,33,0,1,0
18790466,31171,24,0,10,76,0,1,0
18790467,173046,7,0,0,8,0,1,0
18790468,215,46,28,37,76,0,1,0


In [7]:
cat_vars = ['ip','app','device','os','channel','hour','minute']

def proc_col(col):
        uniq = col.unique()
        name2idx = {o:i for i,o in enumerate(uniq)}
        return (uniq, name2idx, np.array([name2idx[x] for x in col]), len(uniq))

name2idx =[]
for v in cat_vars:
    _,temp,train_test_cat_merged[v],_ = proc_col(train_test_cat_merged[v])
    name2idx.append(temp)

for v in train_test_cat_merged.columns:
    train_test_cat_merged[v] = train_test_cat_merged[v].astype('category').cat.as_ordered().copy()

train_cat = train_test_cat_merged[train_test_cat_merged['train']==1].copy()
test_cat = train_test_cat_merged[train_test_cat_merged['train']==0].copy()

del train_test_cat_merged
gc.collect()

72

In [8]:
train_cat.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,0,0,0,0,0,0,0,1
1,1,1,0,1,1,1,1,1
2,2,0,0,0,0,1,1,1
3,3,0,0,2,2,1,1,1
4,4,0,0,0,2,1,1,1


In [9]:
test_cat.head(5)

Unnamed: 0,ip,app,device,os,channel,hour,minute,train
0,66600,1,0,26,27,13,1,0
1,29623,1,0,26,57,13,1,0
2,8141,26,0,6,21,13,1,0
3,18658,45,0,0,104,13,1,0
4,219653,46,0,0,71,13,1,0


In [10]:
y_train[:5]

array([1, 1, 1, 1, 1])

In [19]:
random.seed(42)
val_idx = random.sample(range(len(train_cat.index)),100000)

In [20]:
sum(y_train[val_idx])

49991

In [11]:
#val_idx = range(round(train_cat.shape[0]*0.9),round(train_cat.shape[0]*0.98)+round(train_cat.shape[0]*0.02))

In [21]:
cat_sz = [(c, len(train_cat[c].cat.categories)+1) for c in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [22]:
md = ColumnarModelData.from_data_frame(path, val_idx, train_cat, y_train.astype(np.int32), cat_flds=cat_vars, bs=128,is_reg=False,test_df=test_cat)

In [23]:
m = md.get_learner(emb_szs, len(train_cat.columns)-len(cat_vars),0.04, 2, [1000,500], [0.001,0.01])
lr = 1e-6

In [24]:
#pdb.set_trace()
m.fit(lr, 5,metrics=[accuracy])

epoch      trn_loss   val_loss   accuracy                      
    0      0.434633   0.433134   0.882619  
    1      0.262337   0.266768   0.90242                       
    2      0.257302   0.248825   0.905499                      
    3      0.239157   0.242465   0.907862                      
    4      0.243922   0.238711   0.909063                      



[0.2387112692341475, 0.9090632992937132]

In [None]:
# Run for 3 epochs
m.fit(lr, 3,metrics=[accuracy])

epoch      trn_loss   val_loss   accuracy                      
    0      0.251926   0.213918   0.962236  
 43%|████▎     | 3030/6996 [00:31<00:41, 94.90it/s, loss=0.252]

In [56]:
# Run for 3 epochs
m.fit(lr, 3,metrics=[accuracy])

epoch      trn_loss   val_loss   accuracy                      
    0      0.231756   0.195496   0.965274  
    1      0.231021   0.192207   0.965763                      
    2      0.235816   0.194168   0.964189                      



[0.1941681206226349, 0.9641893870300717]

In [25]:
pred_test = m.predict(True)

In [26]:
preds_0_1 = [np.argmax(i) for i in pred_test]

In [27]:
sum(preds_0_1)

880966

In [29]:
#Read sample_submission

sample_submission = pd.read_csv(path+'sample_submission.csv.zip',compression='zip',low_memory=True)

sample_submission.shape

sample_submission.head(5)

Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [30]:
sample_submission['is_attributed'] = preds_0_1

In [31]:
sample_submission.to_csv(path+"third_submission.csv",index=False)

In [None]:
### Add metrics to fit

m.fit(lr, 1,metrics=[accuracy])

In [32]:
val_preds = m.predict(is_test=False)

In [34]:
val_preds_0_1 = [np.argmax(i) for i in val_preds]

In [36]:
y_val = y_train[val_idx]

In [46]:
confusion_matrix(y_val,np.array(val_preds_0_1),labels=[1,0])

array([[22118, 27873],
       [22243, 27766]])

In [43]:
y_val

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, ..., 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0])

### Other Code

In [47]:
for i in enumerate([10,20,30]):
    print(i)

(0, 10)
(1, 20)
(2, 30)


In [4]:
train = pd.read_csv(path+'train.csv')

In [5]:
train.head(5)

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value
0,744519,1,2014-09-03 00:00:00,1,909655.5
1,7627564,1,2014-09-04 00:00:00,1,1748273.0
2,7034705,1,2014-09-05 00:00:00,1,
3,5995486,1,2014-09-06 00:00:00,1,
4,7326510,1,2014-09-07 00:00:00,1,


In [6]:
train.shape

(6559830, 5)

In [81]:
submission_format = pd.read_csv(path+'submission_format.csv')

In [8]:
submission_format.head(5)

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value
0,1677832,1,2015-08-29 00:00:00,1,0.0
1,5379616,1,2015-08-30 00:00:00,1,0.0
2,496261,1,2015-08-31 00:00:00,1,0.0
3,4567147,1,2015-09-01 00:00:00,1,0.0
4,3684873,1,2015-09-02 00:00:00,1,0.0


In [9]:
submission_format.shape

(1309176, 5)

In [10]:
submission_frequency = pd.read_csv(path+'submission_frequency.csv')

In [11]:
submission_frequency.head(5)

Unnamed: 0,ForecastId,ForecastPeriodNS
0,1,86400000000000
1,2,86400000000000
2,3,86400000000000
3,4,86400000000000
4,5,3600000000000


In [12]:
submission_frequency.shape

(6974, 2)

In [13]:
weather = pd.read_csv(path+'weather.csv')

In [14]:
weather.head(5)

Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature,Distance,SiteId
0,78064,2013-12-31 19:00:00,-7.2,24.889929,1
1,86746,2013-12-31 19:00:00,-8.3,23.303097,1
2,90002,2013-12-31 19:00:00,-7.8,20.952256,1
3,90003,2013-12-31 19:00:00,-8.0,20.952256,1
4,100541,2013-12-31 19:34:00,-8.1,16.610602,1


In [15]:
weather.shape

(20017278, 5)

In [16]:
holidays = pd.read_csv(path+'holidays.csv')

In [17]:
holidays.head(5)

Unnamed: 0.1,Unnamed: 0,Date,Holiday,SiteId
0,0,2016-01-01,New year,1
1,1,2016-01-18,"Birthday of Martin Luther King, Jr.",1
2,2,2016-02-15,Washington's Birthday,1
3,3,2016-05-30,Memorial Day,1
4,4,2016-07-04,Independence Day,1


In [18]:
holidays.shape

(8387, 4)

In [19]:
metadata = pd.read_csv(path+'metadata.csv')

In [20]:
metadata.head(5)

Unnamed: 0,SiteId,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff
0,1,1387.205119,15.0,18.0,False,False,False,False,False,True,True
1,2,6098.278376,30.0,18.0,False,False,False,False,False,True,True
2,3,10556.293605,5.0,18.0,False,False,False,False,False,True,False
3,5,12541.181277,30.0,18.0,False,False,False,False,False,True,True
4,6,9150.195373,30.0,18.0,False,False,False,False,False,True,True


In [21]:
metadata.shape

(267, 11)

In [22]:
### merge train and test sets to perform operations

In [23]:
train['train']=1

In [24]:
submission_format['train']=0

In [25]:
tt_merge = pd.concat([train,submission_format],axis=0)

In [26]:
tt_merge.shape

(7869006, 6)

In [27]:
tt_merge_forecastid = tt_merge.merge(submission_frequency,how='left',left_on='ForecastId',right_on='ForecastId',suffixes=('','_y'))

In [28]:
tt_merge_forecastid['Timestamp'] = pd.to_datetime(tt_merge_forecastid['Timestamp'])

In [29]:
tt_merge_forecastid['year'] = tt_merge_forecastid['Timestamp'].dt.year
tt_merge_forecastid['month'] = tt_merge_forecastid['Timestamp'].dt.month
tt_merge_forecastid['day'] = tt_merge_forecastid['Timestamp'].dt.day
tt_merge_forecastid['hour'] = tt_merge_forecastid['Timestamp'].dt.hour
tt_merge_forecastid['minute'] = tt_merge_forecastid['Timestamp'].dt.minute
tt_merge_forecastid['sec'] = tt_merge_forecastid['Timestamp'].dt.second

In [30]:
weather['Timestamp'] = pd.to_datetime(weather['Timestamp'])

In [31]:
weather['year'] = weather['Timestamp'].dt.year
weather['month'] = weather['Timestamp'].dt.month
weather['day'] = weather['Timestamp'].dt.day
weather['hour'] = weather['Timestamp'].dt.hour
weather['minute'] = weather['Timestamp'].dt.minute
weather['sec'] = weather['Timestamp'].dt.second


In [32]:
weather.head(5)

Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature,Distance,SiteId,year,month,day,hour,minute,sec
0,78064,2013-12-31 19:00:00,-7.2,24.889929,1,2013,12,31,19,0,0
1,86746,2013-12-31 19:00:00,-8.3,23.303097,1,2013,12,31,19,0,0
2,90002,2013-12-31 19:00:00,-7.8,20.952256,1,2013,12,31,19,0,0
3,90003,2013-12-31 19:00:00,-8.0,20.952256,1,2013,12,31,19,0,0
4,100541,2013-12-31 19:34:00,-8.1,16.610602,1,2013,12,31,19,34,0


In [33]:
weather_grouped = weather.groupby(['SiteId','year','month','day']).agg({'Temperature':'mean','Distance':'mean'}).reset_index()

In [34]:
tt_weather = tt_merge_forecastid.merge(weather_grouped,how='left',left_on=['SiteId','year','month','day'],right_on=['SiteId','year','month','day'],suffixes=("","_y"))

In [35]:
tt_weather[tt_weather['train']==0].shape

(1309176, 15)

In [36]:
tt_weather['date'] = tt_weather['Timestamp'].dt.date

In [37]:
holidays['Date'] = pd.to_datetime(holidays['Date']).copy()

In [38]:
holidays['year'] = holidays['Date'].dt.year
holidays['month'] = holidays['Date'].dt.month
holidays['day'] = holidays['Date'].dt.day


In [39]:
hols_grouped = holidays.groupby(['SiteId','Date']).agg({'Holiday':'first','year':'first','month':'first','day':'first'}).reset_index()

In [40]:
tt_wtr_hol = tt_weather.merge(hols_grouped,how='left',left_on=['SiteId','year','month','day'],right_on=['SiteId','year','month','day'],suffixes=("","_y"))

In [41]:
tt_wtr_hol[tt_wtr_hol['train']==0].shape

(1309176, 18)

In [42]:
tt_wtr_hol_md = tt_wtr_hol.merge(metadata,how='left',left_on='SiteId',right_on='SiteId',suffixes=("","_y"))

In [43]:
tt_wtr_hol_md['dayofweek'] = tt_wtr_hol_md['Timestamp'].dt.dayofweek
tt_wtr_hol_md['dayofyear'] = tt_wtr_hol_md['Timestamp'].dt.dayofyear


In [44]:
tt_wtr_hol_md.head(5)

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,train,ForecastPeriodNS,year,month,day,...,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff,dayofweek,dayofyear
0,744519,1,2014-09-03,1,909655.5,1,86400000000000,2014,9,3,...,18.0,False,False,False,False,False,True,True,2,246
1,7627564,1,2014-09-04,1,1748273.0,1,86400000000000,2014,9,4,...,18.0,False,False,False,False,False,True,True,3,247
2,7034705,1,2014-09-05,1,,1,86400000000000,2014,9,5,...,18.0,False,False,False,False,False,True,True,4,248
3,5995486,1,2014-09-06,1,,1,86400000000000,2014,9,6,...,18.0,False,False,False,False,False,True,True,5,249
4,7326510,1,2014-09-07,1,,1,86400000000000,2014,9,7,...,18.0,False,False,False,False,False,True,True,6,250


In [45]:
tt_wtr_hol_md['IsHoliday'] = tt_wtr_hol_md['Holiday'].apply(lambda x:0 if pd.isnull(x) else 1)

In [46]:
tt_wtr_hol_md.columns

Index(['obs_id', 'SiteId', 'Timestamp', 'ForecastId', 'Value', 'train',
       'ForecastPeriodNS', 'year', 'month', 'day', 'hour', 'minute', 'sec',
       'Temperature', 'Distance', 'date', 'Date', 'Holiday', 'Surface',
       'Sampling', 'BaseTemperature', 'MondayIsDayOff', 'TuesdayIsDayOff',
       'WednesdayIsDayOff', 'ThursdayIsDayOff', 'FridayIsDayOff',
       'SaturdayIsDayOff', 'SundayIsDayOff', 'dayofweek', 'dayofyear',
       'IsHoliday'],
      dtype='object')

In [47]:
cat_vars = ['SiteId', 'year', 'month', 'day', 'hour', 'minute', 'sec','Holiday','MondayIsDayOff',
       'TuesdayIsDayOff', 'WednesdayIsDayOff', 'ThursdayIsDayOff',
       'FridayIsDayOff', 'SaturdayIsDayOff', 'SundayIsDayOff', 'dayofweek',
       'dayofyear', 'IsHoliday']
contin_vars = ['ForecastPeriodNS','Temperature', 'Distance','Surface', 'Sampling', 'BaseTemperature']

In [48]:
proc_train = tt_wtr_hol_md[tt_wtr_hol_md['train']==1].copy()
proc_test = tt_wtr_hol_md[tt_wtr_hol_md['train']==0].copy()

In [49]:
proc_train.index = proc_train['date']

In [50]:
proc_test.index = proc_test['date']

In [51]:
proc_test.shape

(1309176, 31)

In [52]:
proc_train = proc_train[-((proc_train['Value'].isnull())|(proc_train['Value']==0))].copy()

In [53]:
proc_train.shape

(6472634, 31)

In [54]:
for v in cat_vars: 
    proc_train[v] = proc_train[v].astype('category').cat.as_ordered().copy()
    proc_test[v] = proc_test[v].astype('category').cat.as_ordered().copy()

In [55]:
for v in contin_vars:
    proc_train[v] = proc_train[v].astype('float32')
    proc_test[v] = proc_test[v].astype('float32')

In [56]:
proc_train = proc_train[cat_vars+contin_vars+['Value']]
proc_test = proc_test[cat_vars+contin_vars+['Value']]