In [1]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing

In [2]:
# Read the raw train / test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Label-encode every object-dtype (string) column of the training frame.
# Each encoder is fitted on the concatenated train + test values so both
# frames share one integer mapping and values that occur only in the test
# set can still be transformed.
for col in train_df.columns:
    is_object = train_df[col].dtype == 'object'
    if not is_object:
        continue
    encoder = preprocessing.LabelEncoder()
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    encoder.fit(train_values + test_values)
    train_df[col] = encoder.transform(train_df[col].values)
    test_df[col] = encoder.transform(test_values)

print('Shape train: {}\nShape test: {}'.format(train_df.shape, test_df.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [4]:
from sklearn.decomposition import PCA, FastICA
n_comp = 10

# Raw feature columns fed to the decompositions: everything in the test frame
# except the target and any pca_* / ica_* columns appended by this cell.
# Selecting them explicitly keeps the cell idempotent (re-running it does not
# decompose its own previous output) and guarantees train and test are passed
# to the transformers with identical column order.
decomp_cols = [
    col for col in test_df.columns
    if col != 'y' and not str(col).startswith(('pca_', 'ica_'))
]
decomp_train = train_df[decomp_cols]
decomp_test = test_df[decomp_cols]

# PCA: fitted on the training features only, then applied to the test features.
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(decomp_train)
pca2_results_test = pca.transform(decomp_test)

# ICA: same protocol as PCA (fit on train, transform test).
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(decomp_train)
ica2_results_test = ica.transform(decomp_test)

# Append decomposition components to datasets as pca_1..pca_n / ica_1..ica_n
# (column names are 1-based, array columns are 0-based).
for i in range(1, n_comp + 1):
    train_df['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test_df['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train_df['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test_df['ica_' + str(i)] = ica2_results_test[:, i - 1]

# Target vector and its mean (used later as the model's base prediction).
y_train = train_df["y"]
y_mean = np.mean(y_train)

In [9]:
#from lightgbm import LGBMRegressor

In [5]:
# Configure, cross-validate and fit the XGBoost regressor.
import xgboost as xgb

# Booster parameters. The number of trees is controlled only by
# `num_boost_round` in the cv/train calls below; XGBoost has no 'n_trees'
# booster parameter, so none is set here.
# NOTE(review): newer XGBoost releases rename 'reg:linear' to
# 'reg:squarederror' and 'silent' to 'verbosity' — confirm the installed
# version before changing these.
xgb_params = {
    'eta': 0.005,
    'max_depth': 5,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train_df.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test_df)

# xgboost, cross-validation: up to 500 rounds, stopping early when the
# held-out RMSE has not improved for 50 consecutive rounds.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=500, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )

# One row of cv_result per boosting round that was kept, so its length is the
# number of rounds to use for the final fit.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model on the full training set (silent=0 overrides the quiet setting
# used during cross-validation)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

[0]	train-rmse:12.6392	test-rmse:12.6384
[50]	train-rmse:11.0519	test-rmse:11.1507
[100]	train-rmse:9.93076	test-rmse:10.147
[150]	train-rmse:9.13745	test-rmse:9.49156
[200]	train-rmse:8.58676	test-rmse:9.07459
[250]	train-rmse:8.20695	test-rmse:8.81136
[300]	train-rmse:7.93971	test-rmse:8.64864
[350]	train-rmse:7.73873	test-rmse:8.54933
[400]	train-rmse:7.5734	test-rmse:8.4934
[450]	train-rmse:7.41972	test-rmse:8.46305
[499]	train-rmse:7.28949	test-rmse:8.44741
500


In [6]:
# Check the R^2 score of the fitted model (to get a higher score, increase
# num_boost_round in the previous cell).
# Note: this is computed on the training data itself, so it is an in-sample
# figure rather than a held-out estimate.
from sklearn.metrics import r2_score

train_labels = dtrain.get_label()
train_pred = model.predict(dtrain)
print(r2_score(train_labels, train_pred))

0.643508711458


In [50]:
# Make predictions on the test set and save them as a submission file.
# The DMatrix is built from the test DataFrame: a DMatrix cannot be
# constructed from another DMatrix, so `test_df` (not `dtest`) is the input.
dtest = xgb.DMatrix(test_df)
y_pred = model.predict(dtest)

# Two-column output: the test row ID (as int32) and the predicted target.
output = pd.DataFrame({'id': test_df['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('xgboost-newfeature-pca-ica.csv', index=False)