In [1]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import  RandomForestRegressor,RandomForestClassifier
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA

In [2]:
def eval_gini(preds, dtrain):
    """Evaluation callback returning the negated normalized Gini score.

    Parameters
    ----------
    preds : array-like
        Predictions to score.
    dtrain : object exposing ``get_label()``
        Container the true labels are read from (an ``xgb.DMatrix`` when
        used as an xgboost ``feval`` -- presumably; confirm against caller).

    Returns
    -------
    tuple
        ``('gini', score)`` where ``score`` is ``Gini(labels, preds)``
        multiplied by -1, so that a better ranking yields a smaller value.
    """
    score = Gini(dtrain.get_label(), preds)
    return 'gini', score * -1

In [3]:
def one_hot_encode(data, features):
    """Integer-encode the given columns and append one-hot indicator columns.

    For every name in ``features`` the column is replaced by integer codes
    (0..k-1, assigned in sorted order of the distinct values) and ``k`` new
    float columns named ``<feature>_<value>`` are appended, each holding 1.0
    where the row has that value and 0.0 otherwise.

    The integer-coded column is deliberately kept next to its indicator
    columns: the original implementation never dropped it (its dtype check
    ran after the column had already been converted to integers), and the
    rest of this notebook drops or uses those coded columns by name.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a copy is encoded and returned.
    features : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the coded columns in place, followed by the
        indicator columns in the order of ``features``.
    """
    # Work on a copy so the caller's frame is never mutated.
    encoded = data.copy()
    indicator_frames = []
    for feature in features:
        # Sorted distinct values plus, for each row, the position of its value.
        labels, codes = np.unique(np.asarray(encoded[feature]), return_inverse=True)
        encoded[feature] = codes
        # Row i, column j is 1.0 exactly when row i carries the j-th label.
        hot = (codes[:, None] == np.arange(len(labels))).astype(np.float64)
        indicator_frames.append(pd.DataFrame(
            hot,
            columns=[feature + '_' + str(label) for label in labels],
            index=encoded.index))
    # Single concat: every piece shares encoded's index, so no realignment
    # (and no removed ``join_axes`` argument) is needed.
    return pd.concat([encoded] + indicator_frames, axis=1)

In [4]:
def Gini(y_true, y_pred):
    """Normalized Gini coefficient of ``y_pred`` as a ranking of ``y_true``.

    The true values are ordered by decreasing prediction and their
    cumulative share (a Lorenz curve) is compared against the diagonal.
    The same quantity computed for the ideal ordering (true values sorted
    by themselves) is used as the normalizer, so a perfect ranking scores
    1.0 and a perfectly reversed ranking scores -1.0.

    Parameters
    ----------
    y_true : array-like, 1-D
        Observed target values.
    y_pred : array-like, 1-D, same shape as ``y_true``
        Predicted scores; only their ordering matters.

    Returns
    -------
    float
        ``G_pred / G_true``. The result is ``nan`` when the true values
        carry no ranking information (e.g. all equal, or fewer than two
        samples), because the normalizer is then zero.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Force float arrays: accepts lists and keeps the divisions below true
    # divisions even for integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # true values ordered from largest to smallest true value / prediction
    true_order = np.sort(y_true)[::-1]
    pred_order = y_true[np.argsort(y_pred)][::-1]

    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # The i-th Lorenz point covers (i + 1) of n samples, so the diagonal must
    # run from 1/n to 1 (not from 0 to 1) to line up with the curves.
    L_ones = np.arange(1, n_samples + 1) / float(n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred / G_true


In [5]:
def xgboost_pred_without_offset(train,labels,test):
    """Fit an xgboost model with the ``count:poisson`` objective and predict.

    Parameters
    ----------
    train : feature matrix accepted by ``xgb.DMatrix``
        Training features.
    labels : array-like
        Training targets passed as the DMatrix label.
    test : feature matrix accepted by ``xgb.DMatrix``
        Rows to predict.

    Returns
    -------
    The output of ``model.predict`` on ``test`` after 1800 boosting rounds.
    """
    booster_params = {
        "objective": "count:poisson",
        "eta": 0.005,
        "min_child_weight": 6,
        "subsample": 0.7,
        "colsample_bytree": 1,
        "scale_pos_weight": 1,
        "silent": 1,
        "max_depth": 6,
        "seed": 42,
    }
    n_boost_rounds = 1800

    # The whole training set is used for fitting; no validation split is made.
    dtrain = xgb.DMatrix(train, label=labels)
    dtest = xgb.DMatrix(test)

    booster = xgb.train(list(booster_params.items()), dtrain, n_boost_rounds)
    return booster.predict(dtest)


In [6]:
def ensemble(X_train, y_train, X_test):
    """Combine fold-wise Poisson models with a binary "low target" model.

    A binary model is fitted on the indicator ``y_train <= 3`` and scored on
    ``X_test``. Separately, one Poisson model is fitted on the training part
    of each of 5 K-fold splits and all of them predict ``X_test``; those
    predictions are summed (not averaged). The binary score, scaled by
    ``5 * 1.5``, is then subtracted from the sum.

    Parameters
    ----------
    X_train, y_train : numpy arrays
        Training features and targets (indexed with integer index arrays).
    X_test : numpy array
        Rows to predict.

    Returns
    -------
    numpy.ndarray
        Combined score per test row. It is on the scale of a 5-model sum,
        so it is meaningful as a ranking rather than as a calibrated value.
    """
    threshold = 3
    n_of_chunks = 5

    low_target_score = binar_xgboost_pred(X_train, y_train <= threshold, X_test)

    folds = cross_validation.KFold(len(X_train), n_of_chunks, random_state = 7)
    summed = np.zeros(len(X_test))
    # Only the training side of each split is used; the held-out side is ignored.
    for fit_idx, _ in folds:
        summed += xgboost_pred_without_offset(X_train[fit_idx], y_train[fit_idx], X_test)

    return summed - low_target_score*n_of_chunks*1.5

In [7]:
def binar_xgboost_pred(X_train,y_train,X_test):
    """Fit an xgboost ``binary:logistic`` tree model and score ``X_test``.

    Parameters
    ----------
    X_train : feature matrix accepted by ``xgb.DMatrix``
        Training features.
    y_train : array-like
        Binary training target passed as the DMatrix label.
    X_test : feature matrix accepted by ``xgb.DMatrix``
        Rows to score.

    Returns
    -------
    The output of ``model.predict`` on ``X_test`` after 800 boosting rounds.
    """
    booster_params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 2,
        'silent': 1,
        'min_child_weight': 25,
        'scale_pos_weight': 1.0,
        'subsample': 0.7,
        'colsample_bytree': 1,
        'booster': 'gbtree',
    }
    n_boost_rounds = 800

    dtrain = xgb.DMatrix(X_train, label = y_train)
    dtest = xgb.DMatrix(X_test)

    booster = xgb.train(list(booster_params.items()), dtrain, n_boost_rounds)
    return booster.predict(dtest)

In [8]:
def cross_val(X,y,n_folds = 3):
    """K-fold cross-validated normalized Gini of ``ensemble``.

    For each split, ``ensemble`` is fitted on the training part and scores
    the held-out part. Held-out scores and targets from all splits are
    pooled and a single ``Gini`` value is computed over the pool.

    Parameters
    ----------
    X, y : numpy arrays
        Features and targets (indexed with integer index arrays).
    n_folds : int, default 3
        Number of K-fold splits.

    Returns
    -------
    The value of ``Gini(pooled_targets, pooled_scores)``.
    """
    folds = cross_validation.KFold(len(X), n_folds)
    pooled_scores = []
    pooled_targets = []
    for fit_idx, holdout_idx in folds:
        held_out_scores = ensemble(X[fit_idx], y[fit_idx], X[holdout_idx])
        pooled_scores += list(held_out_scores)
        pooled_targets += list(y[holdout_idx])
    return Gini(np.array(pooled_targets), np.array(pooled_scores))

In [9]:
# Read both competition files, using the first CSV column as the row index.
train_load = pd.read_csv('train.csv', index_col=0)
test_load = pd.read_csv('test.csv', index_col=0)

# Separate the target from the training features.
labels = train_load['Hazard']
train_load.drop(['Hazard'], axis=1, inplace=True)

In [10]:
# Snapshots of the loaded frames, taken before later cells modify them.
train_s, test_s = train_load.copy(), test_load.copy()

# Feature column names and the test-row index (the latter labels the
# submission rows).
columns = train_load.columns
test_ind = test_load.index

In [11]:
# Tag every row with its origin, then stack train and test so that both are
# encoded with the same categories. The flag is used later to split them again.
train_load['is_train'], test_load['is_train'] = 1, 0
data = pd.concat((train_load, test_load)).copy()

In [12]:
# Columns to integer-encode and expand into one-hot indicator columns.
cat_features = [
    'T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9',
    'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17',
    'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12', 'T2_V13',
]
data = one_hot_encode(data, cat_features)

In [13]:
# Columns removed before modelling: a mix of raw columns, integer-coded
# categorical columns and individual one-hot indicator columns.
drop_features = ['T1_V15_N','T1_V16_I', 'T2_V12', 'T1_V12', 'T2_V5_D', 'T2_V3', 'T1_V4_G', 'T2_V5_C', 'T1_V16_D', 'T1_V11_M', 
 'T1_V4_E', 'T1_V11_D', 'T1_V6', 'T1_V16_O', 'T1_V5_D', 'T1_V11_K', 'T1_V16_M', 'T1_V16_N', 'T1_V5_H', 
 'T1_V15_H', 'T1_V15_D', 'T2_V7', 'T2_V13_D', 'T1_V9_D', 'T1_V16_Q', 'T1_V4_S', 'T2_V13_C', 'T2_V5_E', 'T1_V16_L', 
 'T1_V11_F', 'T1_V16_G', 'T2_V3_N',  'T1_V15_A', 'T2_V5_B', 'T2_V13_E', 'T2_V5_A', 'T1_V15_W', 'T2_V3_Y',
 'T1_V16_A', 'T1_V16_B', 'T2_V12_N', 'T2_V11_Y', 'T1_V17_Y', 'T2_V13_B', 'T2_V13_A', 'T2_V12_Y', 'T1_V17_N', 'T2_V5_F', 
 'T1_V16_R', 'T2_V11_N', 'T1_V15_C', 'T1_V11_A', 'T1_V12_D', 'T1_V9_C', 'T1_V8_D', 'T1_V8_B', 'T1_V12_B', 'T1_V7_D',
 'T1_V7_B', 'T1_V7_A', 'T1_V6_Y', 'T1_V6_N', 'T1_V5_L', 'T1_V5_E', 'T1_V5_B', 'T1_V5_A', 'T1_V4_W', 'T1_V4_C', 'T1_V4_B', 
 'T1_V9_B', 'T1_V8_A', 'T1_V9_G', 'T1_V11_B', 'T1_V11_N', 'T1_V12_A', 'T1_V5_J', 'T1_V5_C', 'T2_V10', 'T1_V15_S', 'T1_V11_H',
 'T1_V16_C', 'T1_V16_H', 'T1_V9_F', 'T1_V8', 'T1_V16_E', 'T1_V16_J', 'T1_V15_F', 'T1_V11_L', 'T1_V5_I',
 'T1_V16_F', 'T1_V5_K', 'T1_V9_E', 'T1_V11_I', 'T1_V11_J', 'T1_V13', 'T1_V10', 'T1_V7', 'T1_V16_K']

# Drop everything in one call instead of one in-place drop per column.
# A missing name still raises KeyError, but now before anything is removed.
data = data.drop(drop_features, axis=1)

In [14]:
# Split the combined frame back into train and test rows and remove the
# helper flag. ``drop`` returns a new frame here; dropping in place on a
# boolean-mask slice is what triggers pandas' SettingWithCopyWarning.
train = data[data['is_train'] == 1].drop('is_train', axis=1)
test = data[data['is_train'] == 0].drop('is_train', axis=1)

# Plain numpy matrices for the modelling functions.
X_train = np.array(train)
X_test = np.array(test)
# Disabled experiment: append the product of the first two columns as a feature.
#X_train = np.append(X_train,np.reshape(X_train[:,0]*X_train[:,1],(len(X_train),1)),axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
# 5-fold cross-validated normalized Gini of the ensemble on the training data.
print(cross_val(X_train, np.array(labels), n_folds=5))

In [88]:
# Fit the ensemble on all training rows and score the test rows.
y_pred = ensemble(X_train, np.array(labels), X_test)

# Write the submission file: one 'Hazard' value per test row, indexed by 'Id'.
preds = pd.DataFrame({"Id": test_ind, "Hazard": y_pred}).set_index('Id')
preds.to_csv('xgboost_ens_submit.csv')