#### Varios modelos

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [4]:
test = pd.read_csv("input/test.csv")
train = pd.read_csv("input/train.csv")

In [5]:
columnas = [a for a in test.columns]

In [6]:
Xb = train[columnas]
yb = train["price"]
X_train, X_test, y_train, y_test = train_test_split(Xb, yb, test_size=0.2)

In [7]:
models = {
    "linealReg": LinearRegression(),
    "forest100": RandomForestRegressor(n_estimators=100),
    "forest200": RandomForestRegressor(n_estimators=200),
    "tree": DecisionTreeRegressor(random_state=0),
    "neigbor":KNeighborsRegressor(),
    "boosting": GradientBoostingRegressor(n_estimators=500),
}

In [8]:
for name,m  in models.items():
    print(f"Training {name}...")
    m.fit(X_train, y_train)
print("Train complete")

Training linealReg...
Training forest100...
Training forest200...
Training tree...
Training neigbor...
Training boosting...
Train complete


In [9]:
from sklearn.metrics import mean_squared_error

printMetric = lambda label,value:print(f"\t {label}: {round(value,3)}")

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",mean_squared_error(y_test, y_pred))
print("Train complete")

Evaluating model linealReg
	 RMSE: 1221053.226
Evaluating model forest100
	 RMSE: 297874.783
Evaluating model forest200
	 RMSE: 298933.23
Evaluating model tree
	 RMSE: 584161.675
Evaluating model neigbor
	 RMSE: 12983993.66
Evaluating model boosting
	 RMSE: 338345.162
Train complete


In [10]:
rf_reg = RandomForestRegressor(n_estimators=200)
rf_reg.fit(X_train, y_train)
rf_reg.feature_importances_
print(rf_reg.score(X_train, y_train))
print(rf_reg.score(X_test, y_test))

0.997214437786311
0.9804366053304029


In [19]:
rmse1 = np.sqrt(mean_squared_error(y_test, rf_reg.predict(X_test)))
rmse1

546.2641832510901

In [11]:
test['price'] = rf_reg.predict(test[columnas])

In [12]:
columnasentrega = ["id","price"]
entrega = test[columnasentrega]

In [13]:
entrega

Unnamed: 0,id,price
0,0,4665.270
1,1,1008.860
2,2,17194.585
3,3,6956.145
4,4,1526.305
...,...,...
13444,13444,676.850
13445,13445,2581.720
13446,13446,510.385
13447,13447,703.070


In [14]:
grad = GradientBoostingRegressor(n_estimators=500)
grad.fit(X_train, y_train)
grad.feature_importances_
print(grad.score(X_train, y_train))
print(grad.score(X_test, y_test))

0.9818848106131669
0.9778551441235982


In [18]:
rmse = np.sqrt(mean_squared_error(y_test, grad.predict(X_test)))
rmse

581.1885464884522

In [57]:
#entrega.to_csv('output/predict15.csv', header=True, index=False)