In [1]:
import pandas as pd
import numpy as np

data_csv = pd.read_csv("train.csv")
test_csv = pd.read_csv("test.csv")
test_id = test_csv["Id"]
size = test_csv.shape
all_data = pd.concat((test_csv,data_csv),sort=False)

#all_data.dropna(axis="columns",inplace=True)
###
all_data.drop(['SalePrice','Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
               'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'],
              axis=1, inplace=True)
# MSSubClass as str
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)

# MSZoning NA in pred. filling with most popular values
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

# LotFrontage  NA in all. I suppose NA means 0
all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['LotFrontage'].mean())

# Alley  NA in all. NA means no access
all_data['Alley'] = all_data['Alley'].fillna('NOACCESS')

# Converting OverallCond to str
all_data.OverallCond = all_data.OverallCond.astype(str)

# MasVnrType NA in all. filling with most popular values
all_data['MasVnrType'] = all_data['MasVnrType'].fillna(all_data['MasVnrType'].mode()[0])

# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('NoBSMT')

# TotalBsmtSF  NA in pred. I suppose NA means 0
all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'].fillna(0)

# Electrical NA in pred. filling with most popular values
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

# KitchenAbvGr to categorical
all_data['KitchenAbvGr'] = all_data['KitchenAbvGr'].astype(str)

# KitchenQual NA in pred. filling with most popular values
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

# FireplaceQu  NA in all. NA means No Fireplace
all_data['FireplaceQu'] = all_data['FireplaceQu'].fillna('NoFP')

# GarageType, GarageFinish, GarageQual  NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual'):
    all_data[col] = all_data[col].fillna('NoGRG')

# GarageCars  NA in pred. I suppose NA means 0
all_data['GarageCars'] = all_data['GarageCars'].fillna(0.0)

# SaleType NA in pred. filling with most popular values
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

# Year and Month to categorical
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

# Adding total sqfootage feature and removing Basement, 1st and 2nd floor all_data
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

###
all_data = pd.get_dummies(all_data,drop_first=True)

test = all_data[0:size[0]]
data = all_data[size[0]:]

x = data
y = data_csv["SalePrice"]

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer


train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=1/3)

In [3]:
# ARBOLES DE DECISION

from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(criterion="mse",splitter="best",max_depth=None,min_samples_split=10,min_samples_leaf=5)
tree.fit(train_x,train_y)
predict_y = tree.predict(test_x)

In [4]:
1-np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.47022587268993843

In [61]:
# K-MEDIAS

from sklearn.neighbors import KNeighborsRegressor

knr = KNeighborsRegressor(n_neighbors=7,p=1,metric="minkowski")
knr.fit(train_x,train_y)
predict_y = knr.predict(test_x)

1- np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.5790554414784395

In [62]:
# Regresion Lineal

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(train_x,train_y)
predict_y = reg.predict(test_x)

1 - np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.4065708418891171

In [63]:
# SVM
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

svr = SVR(kernel="linear", degree=3, gamma="scale", max_iter=-1)
svr.fit(train_x,train_y)
predict_y = svr.predict(test_x)

1 - np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.5112936344969199

In [64]:
# Perceptron Multicapa
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(train_x)
train_x_ss = ss.transform(train_x)
test_x_ss = ss.transform(test_x)

mlp = MLPRegressor(hidden_layer_sizes=(100,100,100),activation="relu",solver="lbfgs",max_iter=50000)
mlp.fit(train_x_ss, train_y)
predict_y = mlp.predict(test_x_ss)

1 - np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.4004106776180698

In [53]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

rand = RandomForestRegressor(n_estimators=100,max_depth=None,min_samples_split=10,max_features=25)
rand.fit(train_x,train_y)
predict_y = rand.predict(test_x)

1 - np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]

0.34086242299794656

In [76]:
# xgboost
import xgboost as xgb

xgb_model = xgb.XGBRegressor(random_state=0, max_depth=5, n_estimators=200, booster="gbtree", n_jobs=4)
xgb_model.fit(train_x, train_y)
predict_y = xgb_model.predict(test_x)
1 - np.sum(abs(predict_y - test_y) < 0.1*test_y)/test_y.shape[0]



0.32854209445585214

In [77]:
# PREDECIR

xgb_model.fit(x,y)

test_out = xgb_model.predict(test)
out = pd.DataFrame({"Id" : test_id, "SalePrice" : test_out})
out.to_csv("out.csv",index=False)

  data.base is not None and isinstance(data, np.ndarray) \


