In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the training and test tables from CSV.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Integer-encode every object-typed (string) column. The encoder is fitted on
# the train and test values together, so a category that appears in only one
# of the two files still gets a code.
categorical_cols = [col for col in train.columns if train[col].dtype == 'object']
for col in categorical_cols:
    encoder = LabelEncoder()
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Report the resulting table sizes.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [4]:
    from sklearn.decomposition import PCA, FastICA
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)


In [18]:
# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    # NOTE(review): 'n_trees' does not appear to be a native XGBoost parameter;
    # the number of boosting rounds is set via num_boost_round below. Confirm
    # and drop if unused.
    'n_trees': 800,
    'eta': 0.005,
    'max_depth': 6,
    'subsample': 0.80,
    'lambda': 2,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=2000, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )
# Function-call form of print: valid on both Python 2 and Python 3.
print(cv_result)

# Use the length of the CV history as the round count for the final fit.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model on the full training set (same params, with silent overridden to 0)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

[0]	train-rmse:12.6391	test-rmse:12.6385
[50]	train-rmse:11.0344	test-rmse:11.1705
[100]	train-rmse:9.87875	test-rmse:10.1732
[150]	train-rmse:9.05326	test-rmse:9.51649
[200]	train-rmse:8.46014	test-rmse:9.09636
[250]	train-rmse:8.03236	test-rmse:8.83399
[300]	train-rmse:7.71625	test-rmse:8.67202
[350]	train-rmse:7.46881	test-rmse:8.57553
[400]	train-rmse:7.27244	test-rmse:8.51903
[450]	train-rmse:7.09905	test-rmse:8.48881
[500]	train-rmse:6.94498	test-rmse:8.47294
[550]	train-rmse:6.80083	test-rmse:8.46735
[600]	train-rmse:6.67723	test-rmse:8.46464
[650]	train-rmse:6.56854	test-rmse:8.46462
     test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
0         12.638462       0.308924        12.639098        0.154292
1         12.603389       0.309063        12.601632        0.153687
2         12.568766       0.309246        12.564341        0.154066
3         12.534529       0.309728        12.527144        0.153869
4         12.500232       0.309396        12.490381        0.

In [15]:
# Check the r2-score of the fitted model on the training data
# (to get a higher score, increase num_boost_round in the previous cell).
from sklearn.metrics import r2_score

# Compare the labels stored in the training DMatrix with the model's
# predictions for that same matrix, i.e. an in-sample score.
train_labels = dtrain.get_label()
train_predictions = model.predict(dtrain)
print(r2_score(train_labels, train_predictions))


0.684710294675


In [16]:
# Predict the target for the test set.
y_pred = model.predict(dtest)

# Pair each prediction with its test-row ID (cast to 32-bit int).
test_ids = test['ID'].astype(np.int32)
output = pd.DataFrame({'id': test_ids, 'y': y_pred})

# Write the results to a CSV whose name records the tree depth used.
submission_path = 'xgboost-depth{}-pca-ica.csv'.format(xgb_params['max_depth'])
output.to_csv(submission_path, index=False)
