In [3]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing

In [4]:
# Load the raw training and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Label-encode every object-dtype column of the training frame.
# Each encoder is fitted on the train and test values together so that a
# category present in only one of the two frames still receives a code.
for column_name in train_df.columns:
    if train_df[column_name].dtype != 'object':
        continue  # non-object columns are left untouched

    train_values = list(train_df[column_name].values)
    test_values = list(test_df[column_name].values)

    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)

    train_df[column_name] = encoder.transform(train_values)
    test_df[column_name] = encoder.transform(test_values)

print('Shape train: {}\nShape test: {}'.format(train_df.shape, test_df.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [6]:
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
# Number of components produced by each decomposition / projection.
n_comp = 12

# Every transformer is fitted on the same inputs: all training columns except
# the target "y". Built once here, before any component column is appended.
train_features = train_df.drop(["y"], axis=1)

# Truncated SVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test_df)

# Principal component analysis
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test_df)

# Independent component analysis
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test_df)

# Gaussian random projection
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test_df)

# Sparse random projection
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test_df)

# Append the components to both frames as "<prefix><k>" columns, k = 1..n_comp.
# Columns are interleaved per component index (pca_k, ica_k, tsvd_k, grp_k, srp_k),
# so the tuple order below fixes the resulting column order.
component_sets = (
    ('pca_', pca2_results_train, pca2_results_test),
    ('ica_', ica2_results_train, ica2_results_test),
    ('tsvd_', tsvd_results_train, tsvd_results_test),
    ('grp_', grp_results_train, grp_results_test),
    ('srp_', srp_results_train, srp_results_test),
)
for component_idx in range(n_comp):
    suffix = str(component_idx + 1)
    for prefix, train_projection, test_projection in component_sets:
        train_df[prefix + suffix] = train_projection[:, component_idx]
        test_df[prefix + suffix] = test_projection[:, component_idx]

# Target vector and its mean (the mean is used later as the model's base score).
y_train = train_df["y"]
y_mean = np.mean(y_train)

In [9]:
#from lightgbm import LGBMRegressor

In [11]:
# Configure, cross-validate and train the XGBoost regressor.
import xgboost as xgb

# Upper bound on the boosting rounds tried during cross-validation; early
# stopping may settle on fewer. Increase (~700) to have better results.
MAX_BOOST_ROUNDS = 500

# Booster parameters for the native XGBoost API.
# The number of trees is set through `num_boost_round` below. The former
# 'n_trees' entry is not an XGBoost booster parameter, so it is dropped.
# NOTE(review): 'reg:squarederror' and 'verbosity' are the current names of
# the deprecated 'reg:linear' and 'silent'; they need a reasonably recent
# XGBoost release -- confirm against the installed version.
xgb_params = {
    'eta': 0.005,
    'max_depth': 5,
    'subsample': 0.95,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'verbosity': 0  # quiet during cross-validation
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train_df.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test_df)

# xgboost, cross-validation: the evaluation history is logged every 50 rounds
# and stops early once the held-out metric has not improved for 50 rounds.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=MAX_BOOST_ROUNDS,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )

# The length of the CV history is the number of rounds kept after early stopping.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train the final model on the full training matrix, with warnings enabled
model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=num_boost_rounds)

[0]	train-rmse:12.6392	test-rmse:12.6385
[50]	train-rmse:11.0454	test-rmse:11.1518
[100]	train-rmse:9.92207	test-rmse:10.1518
[150]	train-rmse:9.12168	test-rmse:9.50252
[200]	train-rmse:8.57265	test-rmse:9.09206
[250]	train-rmse:8.18533	test-rmse:8.84201
[300]	train-rmse:7.90555	test-rmse:8.68946
[350]	train-rmse:7.69676	test-rmse:8.59786
[400]	train-rmse:7.51412	test-rmse:8.54923
[450]	train-rmse:7.34928	test-rmse:8.52152
[499]	train-rmse:7.19838	test-rmse:8.50804
500


In [12]:
# Report the R^2 score of the trained model on the training matrix itself.
# This is an in-sample figure, so it is optimistic compared with the
# cross-validated error; raising num_boost_round in the previous cell raises it.
from sklearn.metrics import r2_score

train_labels = dtrain.get_label()
train_predictions = model.predict(dtrain)
print(r2_score(train_labels, train_predictions))

0.647016328891


In [10]:
# Predict the target for the test matrix and write the results to a CSV file
# with an integer 'id' column and the predicted 'y' column.

y_pred = model.predict(dtest)
test_ids = test_df['ID'].astype(np.int32)
output = pd.DataFrame({'id': test_ids, 'y': y_pred})
output.to_csv('xgboost-1206-pca-ica.csv', index=False)