In [4]:
import pandas as pd
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
train_df = pd.read_csv("train.csv", low_memory=False)
target_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [6]:
train_df = train_df[['id','precio']]
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0) #explorar mejores opciones
features.head()

Unnamed: 0,id,precio,days_to_today,año,antiguedad,centroscomercialescercanos,escuelascercanas,habitaciones,garages,banos,...,Quintana Roo,San luis Potosí,Sinaloa,Sonora,Tabasco,Tamaulipas,Tlaxcala,Veracruz,Yucatán,Zacatecas
0,254099,2273000.0,1540,2015,0.0,0.0,0.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,53461,3600000.0,2326,2013,10.0,1.0,1.0,3.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,247984,1200000.0,1485,2015,5.0,0.0,0.0,3.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,209067,650000.0,2802,2012,1.0,1.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,185997,1150000.0,1251,2016,10.0,0.0,0.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# target column
y_target = features.precio

# drop 'precio' column
features = features.drop(['id','precio'], axis=1)
features.shape

(240000, 79)

In [8]:
# test size
size_for_test = 0.25

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, y_target, test_size=size_for_test)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(180000, 79) (180000,)
(60000, 79) (60000,)


In [10]:
parameters = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb = XGBRegressor(objective='reg:squarederror')

In [11]:
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)


In [12]:
xgb_grid.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed: 24.0min finished
  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:squarederror',
                                    random_st...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0.05, 0.07],
                         'max_depth

In [13]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

0.7964299728029175
{'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


In [14]:
preds = xgb_grid.predict(X_test)

In [15]:
df_pred = pd.DataFrame(preds)
df_pred.head()

Unnamed: 0,0
0,3800328.0
1,2040414.0
2,1889031.0
3,856327.9
4,774040.4


In [16]:
print("Score:", xgb_grid.score(X_test, y_test))

Score: 0.808047849394911


In [17]:
target_df = target_df[['id']]

features_to_pred = pd.merge(target_df, data, on='id', how='inner')
features_to_pred = features_to_pred.fillna(0) #explorar mejores opciones

ids = features_to_pred["id"]

features_to_pred = features_to_pred.drop(['id'], axis=1)

features_to_pred.shape #deberia tener 79 columnas


(60000, 79)

In [18]:
target_prediction = xgb_grid.predict(features_to_pred)
df_predictions = pd.DataFrame(target_prediction)

In [19]:
df_predictions["id"] = ids
df_predictions.rename(columns = {0: 'target'},inplace = True) 
df_predictions = df_predictions[['id','target']]
df_predictions.shape

(60000, 2)

In [20]:
df_predictions.to_csv('ResultadosXGB_rob.csv',index=False)

In [25]:
target_lower_bound = 10000
target_upper_bound = 200000

def adjust_target_values(target_value, min_val):
    if (target_value >= target_upper_bound):
        return target_value
    correction = (target_value - min_val)/(target_upper_bound - min_val)*(target_upper_bound - target_lower_bound)
    return target_lower_bound + correction
    
if (df_predictions.target.min() < target_lower_bound):
    min_val = df_predictions['target'].min()
    df_predictions['target_corr'] = df_predictions.apply(lambda x: adjust_target_values(x['target'], min_val), axis=1)

In [29]:
df_predictions = df_predictions.drop(['target_corr'], axis=1)

KeyError: "['target_corr'] not found in axis"

In [32]:
df_predictions.to_csv('ResultadosXGB_with_bounds.csv',index=False)