In [1]:
import numpy as np 
import pandas as pd

# Load the train/test CSVs from the working directory and report their shapes.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

print(train.shape, test.shape)

(4209, 378) (4209, 377)


In [2]:
# Preview the first rows of the raw training frame (before any encoding).
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of `train`, applying the same
# mapping to the matching column of `test`.
categorical_cols = [col for col in train.columns if train[col].dtype == 'object']

for col in categorical_cols:
    train_values = train[col].tolist()
    test_values = test[col].tolist()

    # Fit on the union of both frames so a category that appears in only one
    # of them still receives a code and transform() cannot fail on it.
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

In [5]:
# Preview the training frame again now that the object columns hold integer codes.
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
# Shared settings for every dimensionality-reduction step in this notebook.
seed = 0
n_comp = 12

# Feature matrix used for fitting: everything in `train` except the target.
# NOTE(review): only 'y' is dropped, so the 'ID' column is part of the input.
decomposition_input = train.drop('y', axis=1)

# Each reducer gets the same component count and its own integer seed.
pca = PCA(n_components=n_comp, random_state=seed)
ica = FastICA(n_components=n_comp, random_state=seed)
tsvd = TruncatedSVD(n_components=n_comp, random_state=seed)

# Fit on the training features only, then project the test frame with the
# already-fitted reducer.
train_pca, test_pca = pca.fit_transform(decomposition_input), pca.transform(test)
train_ica, test_ica = ica.fit_transform(decomposition_input), ica.transform(test)
train_tsvd, test_tsvd = tsvd.fit_transform(decomposition_input), tsvd.transform(test)

In [7]:
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

# Feature matrix used for fitting: everything in `train` except the target.
projection_input = train.drop('y', axis=1)

# Two random-projection reducers sharing the component count and seed
# defined earlier in the notebook (`n_comp`, `seed`).
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=seed)
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=seed)

# Fit on the training features, then project the test frame with the same
# fitted projection.
train_grp, test_grp = grp.fit_transform(projection_input), grp.transform(test)
train_srp, test_srp = srp.fit_transform(projection_input), srp.transform(test)

In [8]:
# Attach every projected component to both frames as '<prefix><k>' columns,
# with k running from 1 to n_comp.
component_sets = [
    ('pca_', train_pca, test_pca),
    ('ica_', train_ica, test_ica),
    ('tsvd_', train_tsvd, test_tsvd),
    ('grp_', train_grp, test_grp),
    ('srp_', train_srp, test_srp),
]

# The outer loop runs over the component number and the inner loop over the
# method, so columns are appended as pca_1, ica_1, tsvd_1, grp_1, srp_1,
# pca_2, ... -- this order determines the feature order seen downstream.
for i in range(1, n_comp + 1):
    for prefix, train_components, test_components in component_sets:
        column = prefix + str(i)
        train[column] = train_components[:, i - 1]
        test[column] = test_components[:, i - 1]

# Target vector and its mean (used later as the model's base score).
y_train = train['y']
y_mean = np.mean(y_train)

In [10]:
import xgboost as xgb

# Booster parameters for the native XGBoost API.
# The former 'n_trees' entry was removed: it is not a native-API parameter, so
# it never limited the number of trees. The number of boosting rounds is set
# by `num_boost_round` (and early stopping) in the calls below.
# NOTE(review): 'reg:linear' and 'silent' are legacy names in newer XGBoost
# releases ('reg:squarederror' / 'verbosity'); they are kept unchanged so the
# notebook still runs on the version it was authored with -- confirm before
# upgrading.
param = {'eta': 0.005,
         'max_depth': 4,
         'subsample': 0.925,
         'objective': 'reg:linear',
         'eval_metric': 'rmse',
         'base_score': y_mean,  # start predictions from the target mean
         'silent': 0}

# Training matrix excludes the target column; the test matrix uses every
# column of `test`.
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

# Cross-validate for up to 1000 rounds, stopping once the held-out RMSE has not
# improved for 50 rounds. Fold count and CV seed are left at xgb.cv's defaults.
cv_result = xgb.cv(param, dtrain, num_boost_round=1000,
                   early_stopping_rounds=50, verbose_eval=50, show_stdv=False)

# Number of rows in the CV history, reused as the round count for the final fit.
# NOTE(review): with early stopping the history is expected to be trimmed to
# the best iteration -- confirm for the installed XGBoost version.
num_boost_rounds = len(cv_result)

[0]	train-rmse:12.6399	test-rmse:12.6381
[50]	train-rmse:11.0904	test-rmse:11.1518
[100]	train-rmse:10.0162	test-rmse:10.1456
[150]	train-rmse:9.28505	test-rmse:9.48665
[200]	train-rmse:8.79507	test-rmse:9.06795
[250]	train-rmse:8.467	test-rmse:8.80643
[300]	train-rmse:8.24589	test-rmse:8.64743
[350]	train-rmse:8.07822	test-rmse:8.55384
[400]	train-rmse:7.92892	test-rmse:8.49887
[450]	train-rmse:7.79216	test-rmse:8.46616
[500]	train-rmse:7.67368	test-rmse:8.44768
[550]	train-rmse:7.56585	test-rmse:8.43664
[600]	train-rmse:7.46033	test-rmse:8.43293
[650]	train-rmse:7.36568	test-rmse:8.43127
[700]	train-rmse:7.28167	test-rmse:8.43299


In [11]:
from sklearn.metrics import r2_score

# Fit the final booster on the full training matrix, using a copy of the
# parameter dict and the round count chosen from cross-validation.
bst = xgb.train(dict(param), dtrain, num_boost_round=num_boost_rounds)

# R^2 on the training data itself (an in-sample score, not a validation score).
train_labels = dtrain.get_label()
train_predictions = bst.predict(dtrain)
train_r2 = r2_score(train_labels, train_predictions)
print(round(train_r2, 2))

0.64


In [12]:
# Predict the target for the test matrix.
y_pred = bst.predict(dtest)

# Pair each prediction with its test-row ID (cast to 32-bit int) and write the
# result to 'submission.csv' without the DataFrame index.
submission_ids = test['ID'].astype(np.int32)
output = pd.DataFrame({'id': submission_ids, 'y': y_pred})
output.to_csv('submission.csv', index=False)