In [63]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [64]:
# Load the training and test CSVs, using the "Id" column as the row index.
data = pd.read_csv("../input/home-data-for-ml-course/train.csv", index_col="Id")
data_test = pd.read_csv("../input/home-data-for-ml-course/test.csv", index_col="Id")

# Rows without a SalePrice have no target to learn from, so drop them.
# Reassign instead of using inplace=True: the result is the same, but the
# statement no longer mutates an existing frame behind the reader's back.
data = data.dropna(axis=0, subset=["SalePrice"])

# Separate the target (y) from the predictor columns (x).
y = data["SalePrice"]
x = data.drop(columns=["SalePrice"])

In [65]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
x_train_full, x_valid_full, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [66]:
# Numeric predictors: columns whose dtype is int64 or float64.
numeric_cols = [
    name
    for name, dtype in x_train_full.dtypes.items()
    if dtype in ["int64", "float64"]
]

# Categorical predictors: object-dtype columns with fewer than 10 distinct
# (non-null) values in the training split.
categorical_cols = [
    name
    for name, dtype in x_train_full.dtypes.items()
    if dtype == "object" and x_train_full[name].nunique() < 10
]

# Keep only the selected columns, numeric first, in both splits.
cols = numeric_cols + categorical_cols
x_train, x_valid = x_train_full[cols], x_valid_full[cols]

In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric columns: replace missing values with the column mean.
numeric_transformer = SimpleImputer(strategy="mean")

# Categorical columns: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets the encoder accept
# categories at transform time that were not present during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [68]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees; the fixed random_state makes training repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [69]:
# Bundle preprocessing and the regressor so that fit/predict apply both in order.
pipeline_steps = [("preprocessor", preprocessor), ("model", model)]
pipeline = Pipeline(steps=pipeline_steps)

In [70]:
from sklearn.metrics import mean_absolute_error

# Fit the full pipeline on the training split and predict the validation split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
preds = pipeline.fit(x_train, y_train).predict(x_valid)

# Report the mean absolute error on the validation split.
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print(score)

In [72]:
# Predict on the competition test set. Select the same feature columns that
# x_train / x_valid were built from, so the frame handed to the pipeline has
# the same schema as the one it was fitted on instead of relying on the
# ColumnTransformer to ignore the unused columns.
x_test = data_test[cols]
test_preds = pipeline.predict(x_test)

# One row per test record: its Id (the frame's index) and the predicted SalePrice.
output = pd.DataFrame({"Id": x_test.index, "SalePrice": test_preds})
output.to_csv("zz.csv", index=False)