In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

X.drop(["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"], inplace=True, axis=1)
X_test_full.drop(["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"], inplace=True, axis=1)


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 7 and X[cname].dtype == "object"]


# Select numeric columns
numeric_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]


# Keep selected columns only
my_cols = numeric_cols + categorical_cols
X = X[my_cols].copy()
X_test = X_test_full[my_cols].copy()


In [15]:
X.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [16]:
NaN = X.isnull().sum()

In [17]:
import sklearn.metrics
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import numpy as np

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    
                           ('imputer', SimpleImputer(strategy='most_frequent')),
                           ('onehot', OneHotEncoder(handle_unknown='ignore'))
 ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [18]:
X = pd.DataFrame(preprocessor.fit_transform(X))
X_test_final = pd.DataFrame(preprocessor.transform(X_test))

In [19]:
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor

modelo = XGBRegressor(objective='reg:squarederror', eval_metric='mae', random_state=0, 
    n_estimators= 550, learning_rate = 0.0495, n_jobs=2, colsample_bytree=0.3, subsample=0.6, verbosity=1)# Your code here

selector = RFE(estimator=modelo , n_features_to_select= 107)
selector.fit(X,y)
print(selector.ranking_)
columnasfinales = X.columns[selector.support_]

KeyboardInterrupt: 

In [7]:
X_test_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,155,156,157,158,159,160,161,162,163,164
0,20.0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,144.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20.0,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60.0,74.0,13830.0,5.0,5.0,1997.0,1998.0,0.0,791.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60.0,78.0,9978.0,6.0,6.0,1998.0,1998.0,20.0,602.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,120.0,43.0,5005.0,8.0,5.0,1992.0,1992.0,0.0,263.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160.0,21.0,1936.0,4.0,7.0,1970.0,1970.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,160.0,21.0,1894.0,4.0,5.0,1970.0,1970.0,0.0,252.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,20.0,160.0,20000.0,5.0,7.0,1960.0,1996.0,0.0,1224.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,85.0,62.0,10441.0,5.0,5.0,1992.0,1992.0,0.0,337.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# for column in X.columns:
    
#     X[column] = X[column]/X[column].max()
    
#     divisor2 = X_test_final[column].max()
    
#     if divisor2 == 0:
        
#         divisor2 = 1
    
#     X_test_final[column] = X_test_final[column]/divisor2
    

In [9]:
# X_test_final

In [10]:
X = X[columnasfinales]
X_test_final = X_test_final[columnasfinales]

NameError: name 'columnasfinales' is not defined

In [11]:

from sklearn.model_selection import cross_val_score

def CrossVal (n_estimators, cv, learning_rate, X, y, max_depth, colsample_bytree,subsample):
    
   #Define the model
    
    my_model = XGBRegressor(objective='reg:squarederror', eval_metric='mae', random_state=0, 
    n_estimators= n_estimators, learning_rate = learning_rate, n_jobs=2, colsample_bytree=colsample_bytree, subsample=subsample, verbosity=1) # Your code here
    
    MAE = -1 * cross_val_score(my_model, X, y, cv=cv, scoring='neg_mean_absolute_error') #Acá pongo X e y enteras porque estoy en la funcion de crossvalidation. SOLITA va a tomar una parte para test y otra para train.
    
    prom_MAE = MAE.mean()
    
    return prom_MAE

results = {}

for estimators in [550]:
    for cv in [5]:
        for learning in [0.0495]:
            for maxdepth in [2,4]:
                for colsample in [0.3]:
                    for subsample in [0.6]:
            
                        results["N_est "+str(estimators)+"/ Cv "+str(cv)+"/ Learning_rate "+str(learning)+ "/ Maxdepth:" + str(maxdepth) + "/ Colsample:" +str(colsample) + "/ Subsample: " + str(subsample)  ] = CrossVal(estimators,cv,learning, X, y, maxdepth, colsample , subsample)

results

{'N_est 550/ Cv 5/ Learning_rate 0.0495/ Maxdepth:2/ Colsample:0.3/ Subsample: 0.6': 15444.614051797946,
 'N_est 550/ Cv 5/ Learning_rate 0.0495/ Maxdepth:4/ Colsample:0.3/ Subsample: 0.6': 15444.614051797946}

In [12]:
modelo = XGBRegressor(objective='reg:squarederror', eval_metric='mae', random_state=0, 
    n_estimators= 550, learning_rate = 0.0495, n_jobs=2, colsample_bytree=0.3, subsample=0.6, verbosity=1)# Your code here

#Fit the model
modelo.fit(X, y, verbose=False) #Armo el modelo con toda la data 



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, eval_metric='mae',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.0495, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=550, n_jobs=2,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.6, tree_method='exact',
             validate_parameters=1, verbosity=1)

In [13]:
preds_test = modelo.predict(X_test_final)


# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index, #Importante que sea el del X_test original pq tiene el index original.
                       'SalePrice': preds_test})
output.to_csv('XGBSubmission.csv', index=False)