In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor

In [2]:
# Load the training and test tables from the shared data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

In [3]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, everything else as numerical. The target column
# 'hours-per-week' is removed from the numerical list so it is not used
# as a feature.
is_object_col = train.dtypes == 'object'
cat_cols = train.columns[is_object_col].tolist()
num_cols = train.columns[~is_object_col].drop(['hours-per-week']).tolist()

In [4]:
# Target to predict.
train_Y = train['hours-per-week']

# Cast every categorical column to str, one dtype per column.
# The previous version selected the feature columns and then assigned the
# converted columns back into that selection, which made pandas emit a
# SettingWithCopyWarning. DataFrame.astype with a column -> dtype mapping
# returns a new frame, so nothing is written into a slice of `train`/`test`.
# Columns not named in the mapping (the numerical ones) keep their dtype.
cat_as_str = {col: 'str' for col in cat_cols}

# Feature frames: numerical columns first, then categorical, same order for
# train and test.
train_X = train[num_cols + cat_cols].astype(cat_as_str)
test_X = test[num_cols + cat_cols].astype(cat_as_str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [5]:
# Hyper-parameters shared by every fold's model.
params_cb = {
    'cat_features': cat_cols,
    'random_seed': 123,
    'n_estimators': 1000,
    'learning_rate': 0.05,
}

# 5-fold stratified split (shuffled, fixed seed). For each fold a CatBoost
# regressor is fitted on the training part, early-stopped on the held-out
# part, and then used to predict the test set.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)
pred_cb = []    # one array of test-set predictions per fold
cb_scores = []  # best validation RMSE per fold
for idxT, idxV in fold.split(train_X, train_Y):
    X_train = train_X.iloc[idxT]
    X_test = train_X.iloc[idxV]
    y_train = train_Y.iloc[idxT]
    y_test = train_Y.iloc[idxV]

    cb = CatBoostRegressor(
        **params_cb,
        early_stopping_rounds=50,
        eval_metric='RMSE',
    )
    cb.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        plot=False,
        verbose=500,
    )

    # Record the best RMSE reached on this fold's validation set.
    fold_best = cb.get_best_score()
    cb_scores.append(fold_best.get('validation').get('RMSE'))

    # Keep this fold's test predictions for later blending.
    pred_cb.append(cb.predict(test_X))



0:	learn: 11.0213992	test: 10.9869885	best: 10.9869885 (0)	total: 226ms	remaining: 3m 45s
500:	learn: 9.2362350	test: 9.5314374	best: 9.5252996 (453)	total: 35.7s	remaining: 35.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 9.525299601
bestIteration = 453

Shrink model to first 454 iterations.
0:	learn: 11.0100532	test: 11.0138702	best: 11.0138702 (0)	total: 67ms	remaining: 1m 6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 9.773388011
bestIteration = 311

Shrink model to first 312 iterations.
0:	learn: 11.0181916	test: 10.9833822	best: 10.9833822 (0)	total: 59.7ms	remaining: 59.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 9.644502908
bestIteration = 235

Shrink model to first 236 iterations.
0:	learn: 11.0010227	test: 11.0283579	best: 11.0283579 (0)	total: 66.6ms	remaining: 1m 6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 9.710133567
bestIteration = 353

Shrink model to first 354 iterations.
0:	l

In [6]:
# Lower RMSE is better, so each fold is weighted by the inverse of its
# score; the weights are then normalised to sum to 1.
inverse_scores = 1 / np.array(cb_scores)
weights = inverse_scores / sum(inverse_scores)

# Report the weighted average of the per-fold validation scores.
local_cv = sum(weights * cb_scores)
print('The Local CV is {}'.format(local_cv))

The Local CV is 9.657757502689186


In [7]:
# Blend the per-fold test predictions with the fold weights and write the
# result to the submission file (single column, no index).
blended_pred = np.dot(weights, pred_cb)
submission = pd.DataFrame({'hours-per-week': blended_pred})
submission.to_csv('submit.csv', index=False)