In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from math import sqrt
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
%matplotlib inline

In [49]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [50]:
train_df = train_df[['id','precio']]
train_df['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [51]:
data.shape

(300000, 16)

In [52]:
test_df.shape

(60000, 22)

In [53]:
train_df.shape

(240000, 2)

In [54]:
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0)
features['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [55]:
labels = features['precio']
features = features.drop(['id','precio'], axis=1)


In [56]:
parameters = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb = xgb.XGBRegressor(objective='reg:squarederror')

In [57]:
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)


In [58]:
xgb_grid.fit(features, labels)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed: 41.8min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:squarederror',
                                    random_st...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0.05, 0.07],
                         'max_depth

In [59]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

0.5470122446736394
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


In [66]:
###########

In [67]:
set_test = pd.merge(data, test_df[['id']],on = "id", how = "inner")
ids = set_test["id"]
set_test = set_test.drop(columns=['id'])

In [68]:
set_test = set_test.fillna(0)

In [69]:
set_test.head()

Unnamed: 0,habitaciones,garages,banos,anio_x,gimnasio,usosmultiples,piscina,anio_y,qty_of_extras,metroscubiertos,metrostotales,popular_desc_1,popular_desc_2,popular_desc_3,popular_desc_4
0,3.0,0.0,4.0,2013,0.0,0.0,0.0,2013,0.0,300.0,0.0,1,0,0,0
1,1.0,1.0,1.0,2015,0.0,0.0,0.0,2015,0.0,67.0,67.0,0,1,0,0
2,2.0,1.0,2.0,2015,0.0,0.0,0.0,2015,0.0,87.0,100.0,0,0,1,0
3,2.0,2.0,2.0,2015,0.0,0.0,0.0,2015,0.0,86.0,86.0,0,0,0,0
4,2.0,1.0,1.0,2013,0.0,0.0,0.0,2013,0.0,80.0,76.0,1,1,0,0


In [70]:
preds_kaggel_knn = xgb_grid.predict(set_test)

In [71]:
df_kaggel_knn = pd.DataFrame(preds_kaggel_knn)
df_kaggel_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 1 columns):
0    60000 non-null float32
dtypes: float32(1)
memory usage: 234.5 KB


In [72]:
df_kaggel_knn = pd.DataFrame(df_kaggel_knn)

In [73]:
df_kaggel_knn.head()

Unnamed: 0,0
0,5302279.5
1,1561060.875
2,1760257.25
3,2291579.5
4,798095.25


In [74]:
# df_kaggel_knn = df_kaggel_knn.to_frame()# df_kagg 
df_kaggel_knn["id"] = ids
df_kaggel_knn.rename(columns = {0: 'target'},inplace = True) 
df_kaggel_knn = df_kaggel_knn[['id','target']]

In [75]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,5302279.5
1,51775,1561060.875
2,115253,1760257.25
3,299321,2291579.5
4,173570,798095.25


In [76]:
df_kaggel_knn.to_csv('ResultadosXGB.csv',index=False)

In [77]:
df_kaggel_knn.shape

(60000, 2)

In [78]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,5302279.5
1,51775,1561060.875
2,115253,1760257.25
3,299321,2291579.5
4,173570,798095.25
