In [2]:
import pandas as pd 
import numpy as np 
import xgboost as xgb
from scipy.optimize import fmin_powell


In [31]:
from ml_metrics import quadratic_weighted_kappa

In [28]:
def get_params():
    
    params = {}
    params["objective"] = "reg:linear"     
    params["eta"] = 0.05
    params["min_child_weight"] = 360
    params["subsample"] = 0.85
    params["colsample_bytree"] = 0.3
    params["silent"] = 1
    params["max_depth"] = 7
    plst = list(params.items())

    return plst

In [32]:
def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)

In [36]:
def apply_offsets(data, offsets):
    for j in range(num_classes):
        data[1, data[0].astype(int)==j] = data[0, data[0].astype(int)==j] + offsets[j]
    return data

In [3]:
# global variables
columns_to_drop = ['Id', 'Response'] #, 'Medical_History_10','Medical_History_24']
xgb_num_rounds = 720
num_classes = 8
missing_indicator = -1000

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
all_data = train.append(test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [9]:
# create any new variables    
all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]


In [15]:
# factorize categorical variables
all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
all_data['Product_Info_2_char'] = pd.factorize(all_data['Product_Info_2_char'])[0]
all_data['Product_Info_2_num'] = pd.factorize(all_data['Product_Info_2_num'])[0]

In [18]:
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

In [20]:
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)

In [23]:
print('Eliminate missing values')    
all_data.fillna(missing_indicator, inplace=True)

Eliminate missing values


In [24]:
# fix the dtype on the label column
all_data['Response'] = all_data['Response'].astype(int)

In [25]:
# split train and test
train = all_data[all_data['Response']>0].copy()
test = all_data[all_data['Response']<1].copy()

In [26]:
# convert data to xgb data structure
xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), train['Response'].values, 
                        missing=missing_indicator)
xgtest = xgb.DMatrix(test.drop(columns_to_drop, axis=1), label=test['Response'].values, 
                        missing=missing_indicator)

In [29]:
# get the parameters for xgboost
plst = get_params()
print(plst) 

[('objective', 'reg:linear'), ('eta', 0.05), ('min_child_weight', 360), ('subsample', 0.85), ('colsample_bytree', 0.3), ('silent', 1), ('max_depth', 7)]


In [30]:
# train model
model = xgb.train(plst, xgtrain, xgb_num_rounds) 

In [33]:
# get preds
train_preds = model.predict(xgtrain, ntree_limit=model.best_iteration)
print('Train score is:', eval_wrapper(train_preds, train['Response'])) 
test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)

Train score is: 0.6515752806586164


In [78]:
#offsets = np.array([0.1, -1.1, -2.2, -1, -0.8, 0.02, 0.8, 1.1])

offsets = np.array([-2.9, -2.08, -0.22, -0.06, 0.31, 0.48, 0.81, 0.94])

In [79]:
# apply offsets to test
data = np.vstack((test_preds, test_preds, test['Response'].values))
data = apply_offsets(data, offsets)


In [80]:
final_test_preds = np.round(np.clip(data[1], 1, 8)).astype(int)

#final_test_preds = np.round(np.clip(test_preds, 1, 8)).astype(int)

In [81]:
preds_out = pd.DataFrame({"Id": test['Id'].values, "Response": final_test_preds})
preds_out = preds_out.set_index('Id')
preds_out.to_csv('submit5.csv')

In [77]:
# offset calculation

In [73]:
x = train_preds-train['Response']
y = pd.DataFrame(data = {'Residual':x,'Response':train['Response']})

In [76]:
list1=[1,2,3,4,5,6,7,8]
for i in list1:
    print(y[y.Response==i].mean())

Residual    2.92967
Response    1.00000
dtype: float64
Residual    2.08574
Response    2.00000
dtype: float64
Residual    0.224209
Response    3.000000
dtype: float64
Residual    0.063303
Response    4.000000
dtype: float64
Residual   -0.31906
Response    5.00000
dtype: float64
Residual   -0.487037
Response    6.000000
dtype: float64
Residual   -0.811857
Response    7.000000
dtype: float64
Residual   -0.947523
Response    8.000000
dtype: float64
