In [166]:
import pandas as pd
import numpy as np
from datetime import timedelta

import matplotlib.pyplot as plt

import evaluation
import make_prediction as mp

In [167]:
df= pd.read_csv('../data/training.csv', parse_dates= True)

### Create an external set - DO NOT CHANGE 

In [70]:
def create_prediction_set(df):
    external_set=df.sample(frac=0.1,random_state=42)
    external_set['row_id']=external_set.reset_index().index
    train_set=df.drop(external_set.index)
    return train_set, external_set

In [71]:
train_set,external_set=create_prediction_set(df)

In [97]:
external_set.shape

(395535, 49)

In [24]:
# creating difference features 

#sample['pct_change']=sample['sensor_t4'].diff()

def pct_change(variable,df):
    for var in variable:
        if var in df.columns:
            df[var+'_pct_change']=df[var].pct_change()
        else:
            print('Column does not exist')

In [25]:
slist =['sensor_t4','sensor_t1','sensor_t5','sensor_q1','sensor_s4','sensor_t6','sensor_q2',
       'sensor_t9','sensor_s3','sensor_t2','sensor_t7','sensor_s1','sensor_t3','sensor_s2','sensor_t8']

s=pct_change(slist,train_set)

In [73]:
# run for holdout st 
t=pct_change(slist,external_set)

In [74]:
external_set.shape

(395535, 49)

### Some products are missing process 
#### Like s50016128 , this product has only gone throught process B and C 

### Aggregration

In [9]:
def std(x):
    f=np.std(x)
    return f

def mean(x):
    f=np.mean(x)
    return f

def entropy(s):
    px = s.value_counts() / s.shape[0]
    lpx = np.log2(px)
    ent = -1.0*(px*lpx).sum()
    return ent 

##### Numerical data from sensors

In [12]:
aggregrations ={
                'sensor_t4':[min,max,mean,std,entropy],
                'sensor_t1':[min,max,mean,std,entropy],
                'sensor_t5':[min,max,mean,std,entropy],
                'sensor_q1':[min,max,mean,std,entropy],
                'sensor_s4':[min,max,mean,std,entropy],
                'sensor_t6':[min,max,mean,std,entropy],
                'sensor_s3':[min,max,mean,std,entropy],
                'sensor_q2':[min,max,mean,std,entropy],
                'sensor_t9':[min,max,mean,std,entropy],
                'sensor_t2':[min,max,mean,std,entropy],
                'sensor_s3':[min,max,mean,std,entropy],
                'sensor_t7':[min,max,mean,std,entropy],
                'sensor_s2':[min,max,mean,std,entropy],
                'sensor_t1':[min,max,mean,std,entropy],
                'sensor_t3':[min,max,mean,std,entropy],
                'sensor_s2':[min,max,mean,std,entropy],
                'sensor_t8':[min,max,mean,std,entropy],
                'timestamp':[min,max],
                'flow_id':['last'],
                'lot_id':['last'],
                'flag_b2':[entropy,mean],
                'flag_c2':[entropy,mean],
                'flag_e': [entropy,mean],
                'flag_a2':[entropy,mean],
                'flag_a4':[entropy,mean],
                'flag_b1':[entropy,mean],
                'flag_d': [entropy,mean],
                'flag_a1':[entropy,mean],
                'flag_b3':[entropy,mean],
                'flag_b4':[entropy,mean],
                'flag_c1':[entropy,mean],
                'flag_a3':[entropy,mean],
                'flag_a5':[entropy,mean],
                'flag_b2':['last',mean],
                'flag_c2':['last',mean],
                'flag_e':['last',mean],
                'flag_a2':['last',mean],
                'flag_a4':['last',mean],
                'flag_b1':['last',mean],
                'flag_d':['last',mean],
                'flag_a1':['last',mean],
                'flag_b3':['last',mean],
                'flag_b4':['last',mean],
                'flag_c1':['last',mean],
                'flag_a3':['last',mean],
                'flag_a5':['last',mean],
                'sensor_t4_pct_change':[min,max],
                'sensor_t1_pct_change':[min,max],
                'sensor_t5_pct_change':[min,max],
                'sensor_q1_pct_change':[min,max],
                'sensor_s4_pct_change':[min,max],
                'sensor_t6_pct_change':[min,max],
                'sensor_s3_pct_change':[min,max],
                'sensor_q2_pct_change':[min,max],
                'sensor_t9_pct_change':[min,max],
                'sensor_t2_pct_change':[min,max],
                'sensor_s3_pct_change':[min,max],
                'sensor_t7_pct_change':[min,max],
                'sensor_s2_pct_change':[min,max],
                'sensor_t1_pct_change':[min,max],
                'sensor_t3_pct_change':[min,max],
                'sensor_s2_pct_change':[min,max],
                'sensor_t8_pct_change':[min,max]
                
  }

In [None]:
plt.hist(train_set['process'],density=True, facecolor='g', alpha=0.75)

In [26]:
group_obj=train_set.groupby(['product_id','process'])

In [75]:
# external dataset HOLDOUT 

group_obj_ext=external_set.groupby(['product_id','process'])

In [None]:
#sample
sample_group = sample.groupby(['product_id','process'])

In [76]:
# apply aggregation functions
'''change the groupby object to full df'''
f=group_obj_ext.agg(aggregrations)

# renaming columns with multiIndex levels 
f.columns = ["_".join(x) for x in f.columns.ravel()]
#unstacking the agg features to create separate feature for each process
funstack=f.unstack(level=1)
# Renaming these features 
funstack.columns = ["_".join(x) for x in funstack.columns.ravel()]
#flattening the index 
funstack.reset_index(inplace=True)

In [77]:
flat= funstack

In [78]:
flat.shape

(4404, 513)

#### adding time duration 
##### count total seconds spent on each process 
######tot_seconds_proccessA = 290
######tot_seconds_proccessB = 456
sample['timestamp_seconds']=sample.loc['timestamp'].dt.second.cumsum()

In [79]:
# converting string to datetime to calculate time spent
timestamplist =['timestamp_max_A','timestamp_min_A','timestamp_max_B','timestamp_min_B','timestamp_max_C',
                 'timestamp_min_C','timestamp_max_D','timestamp_min_D']

# Fill the Missing timestamps as 0 
## Assumption: If no time was spent on the stage then it is meaningful 0


In [80]:
#funstack.loc[:,timestamplist].apply(pd.to_datetime)
for timecolumn in timestamplist:
    flat[timecolumn]= flat[timecolumn].apply(lambda x : pd.to_datetime(x))

In [81]:
# Fill the Missing timestamps as 0 
## Assumption: If no time was spent on the stage then it is meaningful 0
flat['timeSpent_A'] = (flat['timestamp_max_A'] - flat['timestamp_min_A']).fillna(pd.Timedelta(seconds=0))/np.timedelta64(1, 's')
flat['timeSpent_B'] = (flat['timestamp_max_B'] - flat['timestamp_min_B']).fillna(pd.Timedelta(seconds=0))/np.timedelta64(1, 's')
flat['timeSpent_C'] = (flat['timestamp_max_C'] - flat['timestamp_min_C']).fillna(pd.Timedelta(seconds=0))/np.timedelta64(1, 's')
flat['timeSpent_D'] = (flat['timestamp_max_D'] - flat['timestamp_min_D']).fillna(pd.Timedelta(seconds=0))/np.timedelta64(1, 's')

In [82]:
flat.fillna(0,inplace=True)

## Joining the Target label

In [37]:
labels = pd.read_csv('../data/training_label.csv')

In [83]:
merged= flat.merge(labels, left_on='product_id', right_on='product_id', how='left')

## Log transformation 

In [None]:
merged['log_target']= np.log(merged['qc_reading'])

In [None]:
merged.info()

## DataRobot - Project building

In [41]:
import datarobot as dr

In [45]:
dr.Client(token='NWVlYWVmNzI4YjcxY2IyODE2ZTJhOTg5OnhSdjlqSEZ2NlU5cHc4WWQwZlh1RHNkbGZGREE0dlQzQnB5L3k2V2d3a2c9', endpoint='https://app.datarobot.com/api/v2/')

<datarobot.rest.RESTClientObject at 0x16b63f7f0>

In [47]:
project = dr.Project.start(sourcedata=merged,
                           target='qc_reading',
                           project_name='Exec_train_pred_v10',worker_count=-1)
print('Project ID: {}'.format(project.id))

Project ID: 5f2510f06c6a7a108d8d88ae


## prepare the external holdout set 

run the preprocessing steps

In [84]:
holdout_data= merged

In [141]:
#load the project
proj=dr.Project.get('5f2533716c6a7a14d6aff706')
# chose the model 
model = dr.Model.get(project=proj.id,
                     model_id='5f253a83984a4326a8d7f66c')
#upload the external dataset 
#external= proj.upload_dataset(holdout_data)

In [142]:
proj_id=proj.id
model_id=model.id
ext_id= external.id

In [143]:
def get_predictions_(project_id,model_id,ext_id):
    #upload external set
    ## Check if there is an external set already. If yes, then get the dataset.id instead of uploading a new one 
    model = dr.Model.get(project=proj_id,
                     model_id=model_id)
    predict_job=model.request_predictions(ext_id)
    predict_job.wait_for_completion()
    #get predictions
    predictions = predict_job.get_result_when_complete()
    predictions['exp_predictions'] = np.exp(predictions['prediction'])
    predictions['row_id']= predictions.reset_index().index
    return predictions

In [144]:
def join_results(predictions,actual_df):
    result=actual_df.merge(predictions,left_on='row_id',right_on='row_id', how='left')
    labels = pd.read_csv('../data/training_label.csv')
    holdout_with_target= result.merge(labels, left_on='product_id', right_on='product_id', how='left')
    return amape(holdout_with_target['qc_reading'],holdout_with_target['prediction'])

In [147]:
r =get_predictions_(proj_id,model_id,ext_id)

In [148]:
score = join_results(r,external_set)
score

0.6696

In [None]:
predictions['exp_predictions'] = np.exp(predictions['prediction'])

## Evaluation 


#### Beat 0.121 score 

In [92]:
def amape(actual,pred):
    #actual, pred = np.array(actual), np.array(pred)
    for a in actual:
        if a < 2000000:
            actual[a] = 2000000
            #mape =np.round(np.mean(np.abs((actual - pred) / actual)),4)
        else:
            None
            mape =np.round(np.mean(np.abs((actual - pred) / actual)),4)
            return mape

#def mape(actual, pred): 
 #   actual, pred = np.array(actual), np.array(pred)
  #  return np.mean(np.abs((actual - pred) / actual)) * 100